shithub: libvpx

Download patch

ref: b59b3241713282d50af2dce6ae5a1f57223ca6b2
parent: 6e6f5881d8f23cee4d21d4ae86d59363713f3bf4
parent: e272273443ceec31d586003e740af97541e9c4ce
author: Deb Mukherjee <[email protected]>
date: Thu May 22 08:30:38 EDT 2014

Merge "Renames x86_64 specific asm files"

--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ /dev/null
@@ -1,815 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro LF_ABS 2
-        ; %1 value not preserved
-        ; %2 value preserved
-        ; output in %1
-        movdqa      scratch1, %2            ; v2
-
-        psubusb     scratch1, %1            ; v2 - v1
-        psubusb     %1, %2                  ; v1 - v2
-        por         %1, scratch1            ; abs(v2 - v1)
-%endmacro
-
-%macro LF_FILTER_HEV_MASK 8-9
-
-        LF_ABS      %1, %2                  ; abs(p3 - p2)
-        LF_ABS      %2, %3                  ; abs(p2 - p1)
-        pmaxub      %1, %2                  ; accumulate mask
-%if %0 == 8
-        movdqa      scratch2, %3            ; save p1
-        LF_ABS      scratch2, %4            ; abs(p1 - p0)
-%endif
-        LF_ABS      %4, %5                  ; abs(p0 - q0)
-        LF_ABS      %5, %6                  ; abs(q0 - q1)
-%if %0 == 8
-        pmaxub      %5, scratch2            ; accumulate hev
-%else
-        pmaxub      %5, %9
-%endif
-        pmaxub      %1, %5                  ; accumulate mask
-
-        LF_ABS      %3, %6                  ; abs(p1 - q1)
-        LF_ABS      %6, %7                  ; abs(q1 - q2)
-        pmaxub      %1, %6                  ; accumulate mask
-        LF_ABS      %7, %8                  ; abs(q2 - q3)
-        pmaxub      %1, %7                  ; accumulate mask
-
-        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
-        pand        %3, [GLOBAL(tfe)]
-        psrlw       %3, 1                   ; abs(p1 - q1) / 2
-        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-        psubusb     %1, [limit]
-        psubusb     %4, [blimit]
-        por         %1, %4
-        pcmpeqb     %1, zero                ; mask
-
-        psubusb     %5, [thresh]
-        pcmpeqb     %5, zero                ; ~hev
-%endmacro
-
-%macro LF_FILTER 6
-        ; %1-%4: p1-q1
-        ; %5: mask
-        ; %6: hev
-
-        movdqa      scratch2, %6            ; save hev
-
-        pxor        %1, [GLOBAL(t80)]       ; ps1
-        pxor        %4, [GLOBAL(t80)]       ; qs1
-        movdqa      scratch1, %1
-        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
-        pandn       scratch2, scratch1      ; vp8_filter &= hev
-
-        pxor        %2, [GLOBAL(t80)]       ; ps0
-        pxor        %3, [GLOBAL(t80)]       ; qs0
-        movdqa      scratch1, %3
-        psubsb      scratch1, %2            ; qs0 - ps0
-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
-        pand        %5, scratch2            ; &= mask
-
-        movdqa      scratch2, %5
-        paddsb      %5, [GLOBAL(t4)]        ; Filter1
-        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
-
-        ; Filter1 >> 3
-        movdqa      scratch1, zero
-        pcmpgtb     scratch1, %5
-        psrlw       %5, 3
-        pand        scratch1, [GLOBAL(te0)]
-        pand        %5, [GLOBAL(t1f)]
-        por         %5, scratch1
-
-        psubsb      %3, %5                  ; qs0 - Filter1
-        pxor        %3, [GLOBAL(t80)]
-
-        ; Filter2 >> 3
-        movdqa      scratch1, zero
-        pcmpgtb     scratch1, scratch2
-        psrlw       scratch2, 3
-        pand        scratch1, [GLOBAL(te0)]
-        pand        scratch2, [GLOBAL(t1f)]
-        por         scratch2, scratch1
-
-        paddsb      %2, scratch2            ; ps0 + Filter2
-        pxor        %2, [GLOBAL(t80)]
-
-        ; outer tap adjustments
-        paddsb      %5, [GLOBAL(t1)]
-        movdqa      scratch1, zero
-        pcmpgtb     scratch1, %5
-        psrlw       %5, 1
-        pand        scratch1, [GLOBAL(t80)]
-        pand        %5, [GLOBAL(t7f)]
-        por         %5, scratch1
-        pand        %5, %6                  ; vp8_filter &= ~hev
-
-        psubsb      %4, %5                  ; qs1 - vp8_filter
-        pxor        %4, [GLOBAL(t80)]
-
-        paddsb      %1, %5                  ; ps1 + vp8_filter
-        pxor        %1, [GLOBAL(t80)]
-%endmacro
-
-;void vp8_loop_filter_bh_y_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh
-;)
-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
-sym(vp8_loop_filter_bh_y_sse2):
-
-%if LIBVPX_YASM_WIN64
-    %define src      rcx ; src_ptr
-    %define stride   rdx ; src_pixel_step
-    %define blimit   r8
-    %define limit    r9
-    %define thresh   r10
-
-    %define spp      rax
-    %define stride3  r11
-    %define stride5  r12
-    %define stride7  r13
-
-    push    rbp
-    mov     rbp, rsp
-    SAVE_XMM 11
-    push    r12
-    push    r13
-    mov     thresh, arg(4)
-%else
-    %define src      rdi ; src_ptr
-    %define stride   rsi ; src_pixel_step
-    %define blimit   rdx
-    %define limit    rcx
-    %define thresh   r8
-
-    %define spp      rax
-    %define stride3  r9
-    %define stride5  r10
-    %define stride7  r11
-%endif
-
-    %define scratch1 xmm5
-    %define scratch2 xmm6
-    %define zero     xmm7
-
-    %define i0       [src]
-    %define i1       [spp]
-    %define i2       [src + 2 * stride]
-    %define i3       [spp + 2 * stride]
-    %define i4       [src + 4 * stride]
-    %define i5       [spp + 4 * stride]
-    %define i6       [src + 2 * stride3]
-    %define i7       [spp + 2 * stride3]
-    %define i8       [src + 8 * stride]
-    %define i9       [spp + 8 * stride]
-    %define i10      [src + 2 * stride5]
-    %define i11      [spp + 2 * stride5]
-    %define i12      [src + 4 * stride3]
-    %define i13      [spp + 4 * stride3]
-    %define i14      [src + 2 * stride7]
-    %define i15      [spp + 2 * stride7]
-
-    ; prep work
-    lea         spp, [src + stride]
-    lea         stride3, [stride + 2 * stride]
-    lea         stride5, [stride3 + 2 * stride]
-    lea         stride7, [stride3 + 4 * stride]
-    pxor        zero, zero
-
-        ; load the first set into registers
-        movdqa       xmm0, i0
-        movdqa       xmm1, i1
-        movdqa       xmm2, i2
-        movdqa       xmm3, i3
-        movdqa       xmm4, i4
-        movdqa       xmm8, i5
-        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0)
-        movdqa       xmm10, i7
-LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
-
-        movdqa       xmm1, i2
-        movdqa       xmm2, i3
-        movdqa       xmm3, i4
-        movdqa       xmm8, i5
-LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
-        movdqa       i2, xmm1
-        movdqa       i3, xmm2
-
-; second set
-        movdqa       i4, xmm3
-        movdqa       i5, xmm8
-
-        movdqa       xmm0, i6
-        movdqa       xmm1, i7
-        movdqa       xmm2, i8
-        movdqa       xmm4, i9
-        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
-        movdqa       xmm11, i11
-LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
-
-        movdqa       xmm0, i6
-        movdqa       xmm1, i7
-        movdqa       xmm4, i8
-        movdqa       xmm8, i9
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
-        movdqa       i6, xmm0
-        movdqa       i7, xmm1
-
-; last set
-        movdqa       i8, xmm4
-        movdqa       i9, xmm8
-
-        movdqa       xmm0, i10
-        movdqa       xmm1, i11
-        movdqa       xmm2, i12
-        movdqa       xmm3, i13
-        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
-        movdqa       xmm11, i15
-LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
-
-        movdqa       xmm0, i10
-        movdqa       xmm1, i11
-        movdqa       xmm3, i12
-        movdqa       xmm8, i13
-LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
-        movdqa       i10, xmm0
-        movdqa       i11, xmm1
-        movdqa       i12, xmm3
-        movdqa       i13, xmm8
-
-%if LIBVPX_YASM_WIN64
-    pop    r13
-    pop    r12
-    RESTORE_XMM
-    pop    rbp
-%endif
-
-    ret
-
-
-;void vp8_loop_filter_bv_y_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh
-;)
-
-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
-sym(vp8_loop_filter_bv_y_sse2):
-
-%if LIBVPX_YASM_WIN64
-    %define src      rcx ; src_ptr
-    %define stride   rdx ; src_pixel_step
-    %define blimit   r8
-    %define limit    r9
-    %define thresh   r10
-
-    %define spp      rax
-    %define stride3  r11
-    %define stride5  r12
-    %define stride7  r13
-
-    push    rbp
-    mov     rbp, rsp
-    SAVE_XMM 15
-    push    r12
-    push    r13
-    mov     thresh, arg(4)
-%else
-    %define src      rdi
-    %define stride   rsi
-    %define blimit   rdx
-    %define limit    rcx
-    %define thresh   r8
-
-    %define spp      rax
-    %define stride3  r9
-    %define stride5  r10
-    %define stride7  r11
-%endif
-
-    %define scratch1 xmm5
-    %define scratch2 xmm6
-    %define zero     xmm7
-
-    %define s0       [src]
-    %define s1       [spp]
-    %define s2       [src + 2 * stride]
-    %define s3       [spp + 2 * stride]
-    %define s4       [src + 4 * stride]
-    %define s5       [spp + 4 * stride]
-    %define s6       [src + 2 * stride3]
-    %define s7       [spp + 2 * stride3]
-    %define s8       [src + 8 * stride]
-    %define s9       [spp + 8 * stride]
-    %define s10      [src + 2 * stride5]
-    %define s11      [spp + 2 * stride5]
-    %define s12      [src + 4 * stride3]
-    %define s13      [spp + 4 * stride3]
-    %define s14      [src + 2 * stride7]
-    %define s15      [spp + 2 * stride7]
-
-    %define i0       [rsp]
-    %define i1       [rsp + 16]
-    %define i2       [rsp + 32]
-    %define i3       [rsp + 48]
-    %define i4       [rsp + 64]
-    %define i5       [rsp + 80]
-    %define i6       [rsp + 96]
-    %define i7       [rsp + 112]
-    %define i8       [rsp + 128]
-    %define i9       [rsp + 144]
-    %define i10      [rsp + 160]
-    %define i11      [rsp + 176]
-    %define i12      [rsp + 192]
-    %define i13      [rsp + 208]
-    %define i14      [rsp + 224]
-    %define i15      [rsp + 240]
-
-    ALIGN_STACK 16, rax
-
-    ; reserve stack space
-    %define      temp_storage  0 ; size is 256 (16*16)
-    %define      stack_size 256
-    sub          rsp, stack_size
-
-    ; prep work
-    lea         spp, [src + stride]
-    lea         stride3, [stride + 2 * stride]
-    lea         stride5, [stride3 + 2 * stride]
-    lea         stride7, [stride3 + 4 * stride]
-
-        ; 8-f
-        movdqa      xmm0, s8
-        movdqa      xmm1, xmm0
-        punpcklbw   xmm0, s9                ; 80 90
-        punpckhbw   xmm1, s9                ; 88 98
-
-        movdqa      xmm2, s10
-        movdqa      xmm3, xmm2
-        punpcklbw   xmm2, s11 ; a0 b0
-        punpckhbw   xmm3, s11 ; a8 b8
-
-        movdqa      xmm4, xmm0
-        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
-        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
-
-        movdqa      xmm2, xmm1
-        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
-        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
-
-        ; using xmm[0124]
-        ; work on next 4 rows
-
-        movdqa      xmm3, s12
-        movdqa      xmm5, xmm3
-        punpcklbw   xmm3, s13 ; c0 d0
-        punpckhbw   xmm5, s13 ; c8 d8
-
-        movdqa      xmm6, s14
-        movdqa      xmm7, xmm6
-        punpcklbw   xmm6, s15 ; e0 f0
-        punpckhbw   xmm7, s15 ; e8 f8
-
-        movdqa      xmm8, xmm3
-        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
-        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
-
-        movdqa      xmm6, xmm5
-        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
-        punpckhwd   xmm6, xmm7              ; cc dc ec fc
-
-        ; pull the third and fourth sets together
-
-        movdqa      xmm7, xmm0
-        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
-        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
-
-        movdqa      xmm3, xmm4
-        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
-        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
-
-        movdqa      xmm8, xmm1
-        punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
-        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
-
-        movdqa      xmm5, xmm2
-        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
-        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
-
-        ; save the calculations. we only have 15 registers ...
-        movdqa      i0, xmm0
-        movdqa      i1, xmm7
-        movdqa      i2, xmm4
-        movdqa      i3, xmm3
-        movdqa      i4, xmm1
-        movdqa      i5, xmm8
-        movdqa      i6, xmm2
-        movdqa      i7, xmm5
-
-        ; 0-7
-        movdqa      xmm0, s0
-        movdqa      xmm1, xmm0
-        punpcklbw   xmm0, s1 ; 00 10
-        punpckhbw   xmm1, s1 ; 08 18
-
-        movdqa      xmm2, s2
-        movdqa      xmm3, xmm2
-        punpcklbw   xmm2, s3 ; 20 30
-        punpckhbw   xmm3, s3 ; 28 38
-
-        movdqa      xmm4, xmm0
-        punpcklwd   xmm0, xmm2              ; 00 10 20 30
-        punpckhwd   xmm4, xmm2              ; 04 14 24 34
-
-        movdqa      xmm2, xmm1
-        punpcklwd   xmm1, xmm3              ; 08 18 28 38
-        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
-
-        ; using xmm[0124]
-        ; work on next 4 rows
-
-        movdqa      xmm3, s4
-        movdqa      xmm5, xmm3
-        punpcklbw   xmm3, s5 ; 40 50
-        punpckhbw   xmm5, s5 ; 48 58
-
-        movdqa      xmm6, s6
-        movdqa      xmm7, xmm6
-        punpcklbw   xmm6, s7   ; 60 70
-        punpckhbw   xmm7, s7   ; 68 78
-
-        movdqa      xmm8, xmm3
-        punpcklwd   xmm3, xmm6              ; 40 50 60 70
-        punpckhwd   xmm8, xmm6              ; 44 54 64 74
-
-        movdqa      xmm6, xmm5
-        punpcklwd   xmm5, xmm7              ; 48 58 68 78
-        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
-
-        ; pull the first two sets together
-
-        movdqa      xmm7, xmm0
-        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
-        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
-
-        movdqa      xmm3, xmm4
-        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
-        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
-
-        movdqa      xmm8, xmm1
-        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
-        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
-
-        movdqa      xmm5, xmm2
-        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
-        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
-        ; final combination
-
-        movdqa      xmm6, xmm0
-        punpcklqdq  xmm0, i0
-        punpckhqdq  xmm6, i0
-
-        movdqa      xmm9, xmm7
-        punpcklqdq  xmm7, i1
-        punpckhqdq  xmm9, i1
-
-        movdqa      xmm10, xmm4
-        punpcklqdq  xmm4, i2
-        punpckhqdq  xmm10, i2
-
-        movdqa      xmm11, xmm3
-        punpcklqdq  xmm3, i3
-        punpckhqdq  xmm11, i3
-
-        movdqa      xmm12, xmm1
-        punpcklqdq  xmm1, i4
-        punpckhqdq  xmm12, i4
-
-        movdqa      xmm13, xmm8
-        punpcklqdq  xmm8, i5
-        punpckhqdq  xmm13, i5
-
-        movdqa      xmm14, xmm2
-        punpcklqdq  xmm2, i6
-        punpckhqdq  xmm14, i6
-
-        movdqa      xmm15, xmm5
-        punpcklqdq  xmm5, i7
-        punpckhqdq  xmm15, i7
-
-        movdqa      i0, xmm0
-        movdqa      i1, xmm6
-        movdqa      i2, xmm7
-        movdqa      i3, xmm9
-        movdqa      i4, xmm4
-        movdqa      i5, xmm10
-        movdqa      i6, xmm3
-        movdqa      i7, xmm11
-        movdqa      i8, xmm1
-        movdqa      i9, xmm12
-        movdqa      i10, xmm8
-        movdqa      i11, xmm13
-        movdqa      i12, xmm2
-        movdqa      i13, xmm14
-        movdqa      i14, xmm5
-        movdqa      i15, xmm15
-
-; TRANSPOSED DATA AVAILABLE ON THE STACK
-
-        movdqa      xmm12, xmm6
-        movdqa      xmm13, xmm7
-
-        pxor        zero, zero
-
-LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
-
-        movdqa       xmm1, i2
-        movdqa       xmm2, i3
-        movdqa       xmm8, i4
-        movdqa       xmm9, i5
-LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
-        movdqa       i2, xmm1
-        movdqa       i3, xmm2
-
-; second set
-        movdqa       i4, xmm8
-        movdqa       i5, xmm9
-
-        movdqa       xmm0, i6
-        movdqa       xmm1, i7
-        movdqa       xmm2, i8
-        movdqa       xmm4, i9
-        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
-        movdqa       xmm11, i11
-LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
-
-        movdqa       xmm0, i6
-        movdqa       xmm1, i7
-        movdqa       xmm3, i8
-        movdqa       xmm4, i9
-LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
-        movdqa       i6, xmm0
-        movdqa       i7, xmm1
-
-; last set
-        movdqa       i8, xmm3
-        movdqa       i9, xmm4
-
-        movdqa       xmm0, i10
-        movdqa       xmm1, i11
-        movdqa       xmm2, i12
-        movdqa       xmm8, i13
-        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
-        movdqa       xmm11, i15
-LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
-
-        movdqa       xmm0, i10
-        movdqa       xmm1, i11
-        movdqa       xmm4, i12
-        movdqa       xmm8, i13
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
-        movdqa       i10, xmm0
-        movdqa       i11, xmm1
-        movdqa       i12, xmm4
-        movdqa       i13, xmm8
-
-
-; RESHUFFLE AND WRITE OUT
-        ; 8-f
-        movdqa      xmm0, i8
-        movdqa      xmm1, xmm0
-        punpcklbw   xmm0, i9                ; 80 90
-        punpckhbw   xmm1, i9                ; 88 98
-
-        movdqa      xmm2, i10
-        movdqa      xmm3, xmm2
-        punpcklbw   xmm2, i11               ; a0 b0
-        punpckhbw   xmm3, i11               ; a8 b8
-
-        movdqa      xmm4, xmm0
-        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
-        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
-
-        movdqa      xmm2, xmm1
-        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
-        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
-
-        ; using xmm[0124]
-        ; work on next 4 rows
-
-        movdqa      xmm3, i12
-        movdqa      xmm5, xmm3
-        punpcklbw   xmm3, i13               ; c0 d0
-        punpckhbw   xmm5, i13               ; c8 d8
-
-        movdqa      xmm6, i14
-        movdqa      xmm7, xmm6
-        punpcklbw   xmm6, i15               ; e0 f0
-        punpckhbw   xmm7, i15               ; e8 f8
-
-        movdqa      xmm8, xmm3
-        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
-        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
-
-        movdqa      xmm6, xmm5
-        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
-        punpckhwd   xmm6, xmm7              ; cc dc ec fc
-
-        ; pull the third and fourth sets together
-
-        movdqa      xmm7, xmm0
-        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
-        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
-
-        movdqa      xmm3, xmm4
-        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
-        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
-
-        movdqa      xmm8, xmm1
-        punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
-        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
-
-        movdqa      xmm5, xmm2
-        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
-        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
-
-        ; save the calculations. we only have 15 registers ...
-        movdqa      i8, xmm0
-        movdqa      i9, xmm7
-        movdqa      i10, xmm4
-        movdqa      i11, xmm3
-        movdqa      i12, xmm1
-        movdqa      i13, xmm8
-        movdqa      i14, xmm2
-        movdqa      i15, xmm5
-
-        ; 0-7
-        movdqa      xmm0, i0
-        movdqa      xmm1, xmm0
-        punpcklbw   xmm0, i1                ; 00 10
-        punpckhbw   xmm1, i1                ; 08 18
-
-        movdqa      xmm2, i2
-        movdqa      xmm3, xmm2
-        punpcklbw   xmm2, i3                ; 20 30
-        punpckhbw   xmm3, i3                ; 28 38
-
-        movdqa      xmm4, xmm0
-        punpcklwd   xmm0, xmm2              ; 00 10 20 30
-        punpckhwd   xmm4, xmm2              ; 04 14 24 34
-
-        movdqa      xmm2, xmm1
-        punpcklwd   xmm1, xmm3              ; 08 18 28 38
-        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
-
-        ; using xmm[0124]
-        ; work on next 4 rows
-
-        movdqa      xmm3, i4
-        movdqa      xmm5, xmm3
-        punpcklbw   xmm3, i5                ; 40 50
-        punpckhbw   xmm5, i5                ; 48 58
-
-        movdqa      xmm6, i6
-        movdqa      xmm7, xmm6
-        punpcklbw   xmm6, i7                ; 60 70
-        punpckhbw   xmm7, i7                ; 68 78
-
-        movdqa      xmm8, xmm3
-        punpcklwd   xmm3, xmm6              ; 40 50 60 70
-        punpckhwd   xmm8, xmm6              ; 44 54 64 74
-
-        movdqa      xmm6, xmm5
-        punpcklwd   xmm5, xmm7              ; 48 58 68 78
-        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
-
-        ; pull the first two sets together
-
-        movdqa      xmm7, xmm0
-        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
-        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
-
-        movdqa      xmm3, xmm4
-        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
-        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
-
-        movdqa      xmm8, xmm1
-        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
-        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
-
-        movdqa      xmm5, xmm2
-        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
-        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
-        ; final combination
-
-        movdqa      xmm6, xmm0
-        punpcklqdq  xmm0, i8
-        punpckhqdq  xmm6, i8
-
-        movdqa      xmm9, xmm7
-        punpcklqdq  xmm7, i9
-        punpckhqdq  xmm9, i9
-
-        movdqa      xmm10, xmm4
-        punpcklqdq  xmm4, i10
-        punpckhqdq  xmm10, i10
-
-        movdqa      xmm11, xmm3
-        punpcklqdq  xmm3, i11
-        punpckhqdq  xmm11, i11
-
-        movdqa      xmm12, xmm1
-        punpcklqdq  xmm1, i12
-        punpckhqdq  xmm12, i12
-
-        movdqa      xmm13, xmm8
-        punpcklqdq  xmm8, i13
-        punpckhqdq  xmm13, i13
-
-        movdqa      xmm14, xmm2
-        punpcklqdq  xmm2, i14
-        punpckhqdq  xmm14, i14
-
-        movdqa      xmm15, xmm5
-        punpcklqdq  xmm5, i15
-        punpckhqdq  xmm15, i15
-
-        movdqa      s0, xmm0
-        movdqa      s1, xmm6
-        movdqa      s2, xmm7
-        movdqa      s3, xmm9
-        movdqa      s4, xmm4
-        movdqa      s5, xmm10
-        movdqa      s6, xmm3
-        movdqa      s7, xmm11
-        movdqa      s8, xmm1
-        movdqa      s9, xmm12
-        movdqa      s10, xmm8
-        movdqa      s11, xmm13
-        movdqa      s12, xmm2
-        movdqa      s13, xmm14
-        movdqa      s14, xmm5
-        movdqa      s15, xmm15
-
-    ; free stack space
-    add          rsp, stack_size
-
-    ; un-ALIGN_STACK
-    pop          rsp
-
-%if LIBVPX_YASM_WIN64
-    pop    r13
-    pop    r12
-    RESTORE_XMM
-    pop    rbp
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-te0:
-    times 16 db 0xe0
-align 16
-t7f:
-    times 16 db 0x7f
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t1f:
-    times 16 db 0x1f
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
--- /dev/null
+++ b/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -1,0 +1,815 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+        ; %1 value not preserved
+        ; %2 value preserved
+        ; output in %1
+        movdqa      scratch1, %2            ; v2
+
+        psubusb     scratch1, %1            ; v2 - v1
+        psubusb     %1, %2                  ; v1 - v2
+        por         %1, scratch1            ; abs(v2 - v1)
+%endmacro
+
+%macro LF_FILTER_HEV_MASK 8-9
+
+        LF_ABS      %1, %2                  ; abs(p3 - p2)
+        LF_ABS      %2, %3                  ; abs(p2 - p1)
+        pmaxub      %1, %2                  ; accumulate mask
+%if %0 == 8
+        movdqa      scratch2, %3            ; save p1
+        LF_ABS      scratch2, %4            ; abs(p1 - p0)
+%endif
+        LF_ABS      %4, %5                  ; abs(p0 - q0)
+        LF_ABS      %5, %6                  ; abs(q0 - q1)
+%if %0 == 8
+        pmaxub      %5, scratch2            ; accumulate hev
+%else
+        pmaxub      %5, %9
+%endif
+        pmaxub      %1, %5                  ; accumulate mask
+
+        LF_ABS      %3, %6                  ; abs(p1 - q1)
+        LF_ABS      %6, %7                  ; abs(q1 - q2)
+        pmaxub      %1, %6                  ; accumulate mask
+        LF_ABS      %7, %8                  ; abs(q2 - q3)
+        pmaxub      %1, %7                  ; accumulate mask
+
+        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
+        pand        %3, [GLOBAL(tfe)]
+        psrlw       %3, 1                   ; abs(p1 - q1) / 2
+        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+        psubusb     %1, [limit]
+        psubusb     %4, [blimit]
+        por         %1, %4
+        pcmpeqb     %1, zero                ; mask
+
+        psubusb     %5, [thresh]
+        pcmpeqb     %5, zero                ; ~hev
+%endmacro
+
+%macro LF_FILTER 6
+        ; %1-%4: p1-q1
+        ; %5: mask
+        ; %6: hev
+
+        movdqa      scratch2, %6            ; save hev
+
+        pxor        %1, [GLOBAL(t80)]       ; ps1
+        pxor        %4, [GLOBAL(t80)]       ; qs1
+        movdqa      scratch1, %1
+        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
+        pandn       scratch2, scratch1      ; vp8_filter &= hev
+
+        pxor        %2, [GLOBAL(t80)]       ; ps0
+        pxor        %3, [GLOBAL(t80)]       ; qs0
+        movdqa      scratch1, %3
+        psubsb      scratch1, %2            ; qs0 - ps0
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        pand        %5, scratch2            ; &= mask
+
+        movdqa      scratch2, %5
+        paddsb      %5, [GLOBAL(t4)]        ; Filter1
+        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
+
+        ; Filter1 >> 3
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5
+        psrlw       %5, 3
+        pand        scratch1, [GLOBAL(te0)]
+        pand        %5, [GLOBAL(t1f)]
+        por         %5, scratch1
+
+        psubsb      %3, %5                  ; qs0 - Filter1
+        pxor        %3, [GLOBAL(t80)]
+
+        ; Filter2 >> 3
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, scratch2
+        psrlw       scratch2, 3
+        pand        scratch1, [GLOBAL(te0)]
+        pand        scratch2, [GLOBAL(t1f)]
+        por         scratch2, scratch1
+
+        paddsb      %2, scratch2            ; ps0 + Filter2
+        pxor        %2, [GLOBAL(t80)]
+
+        ; outer tap adjustments
+        paddsb      %5, [GLOBAL(t1)]
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5
+        psrlw       %5, 1
+        pand        scratch1, [GLOBAL(t80)]
+        pand        %5, [GLOBAL(t7f)]
+        por         %5, scratch1
+        pand        %5, %6                  ; vp8_filter &= ~hev
+
+        psubsb      %4, %5                  ; qs1 - vp8_filter
+        pxor        %4, [GLOBAL(t80)]
+
+        paddsb      %1, %5                  ; ps1 + vp8_filter
+        pxor        %1, [GLOBAL(t80)]
+%endmacro
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 11
+    push    r12
+    push    r13
+    mov     thresh, arg(4)
+%else
+    %define src      rdi ; src_ptr
+    %define stride   rsi ; src_pixel_step
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define i0       [src]
+    %define i1       [spp]
+    %define i2       [src + 2 * stride]
+    %define i3       [spp + 2 * stride]
+    %define i4       [src + 4 * stride]
+    %define i5       [spp + 4 * stride]
+    %define i6       [src + 2 * stride3]
+    %define i7       [spp + 2 * stride3]
+    %define i8       [src + 8 * stride]
+    %define i9       [spp + 8 * stride]
+    %define i10      [src + 2 * stride5]
+    %define i11      [spp + 2 * stride5]
+    %define i12      [src + 4 * stride3]
+    %define i13      [spp + 4 * stride3]
+    %define i14      [src + 2 * stride7]
+    %define i15      [spp + 2 * stride7]
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
+    pxor        zero, zero
+
+        ; load the first set into registers
+        movdqa       xmm0, i0
+        movdqa       xmm1, i1
+        movdqa       xmm2, i2
+        movdqa       xmm3, i3
+        movdqa       xmm4, i4
+        movdqa       xmm8, i5
+        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0)
+        movdqa       xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+        movdqa       xmm1, i2
+        movdqa       xmm2, i3
+        movdqa       xmm3, i4
+        movdqa       xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm3
+        movdqa       i5, xmm8
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm4, i8
+        movdqa       xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm4
+        movdqa       i9, xmm8
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm3, i13
+        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm3, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm3
+        movdqa       i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 15
+    push    r12
+    push    r13
+    mov     thresh, arg(4)
+%else
+    %define src      rdi
+    %define stride   rsi
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define s0       [src]
+    %define s1       [spp]
+    %define s2       [src + 2 * stride]
+    %define s3       [spp + 2 * stride]
+    %define s4       [src + 4 * stride]
+    %define s5       [spp + 4 * stride]
+    %define s6       [src + 2 * stride3]
+    %define s7       [spp + 2 * stride3]
+    %define s8       [src + 8 * stride]
+    %define s9       [spp + 8 * stride]
+    %define s10      [src + 2 * stride5]
+    %define s11      [spp + 2 * stride5]
+    %define s12      [src + 4 * stride3]
+    %define s13      [spp + 4 * stride3]
+    %define s14      [src + 2 * stride7]
+    %define s15      [spp + 2 * stride7]
+
+    %define i0       [rsp]
+    %define i1       [rsp + 16]
+    %define i2       [rsp + 32]
+    %define i3       [rsp + 48]
+    %define i4       [rsp + 64]
+    %define i5       [rsp + 80]
+    %define i6       [rsp + 96]
+    %define i7       [rsp + 112]
+    %define i8       [rsp + 128]
+    %define i9       [rsp + 144]
+    %define i10      [rsp + 160]
+    %define i11      [rsp + 176]
+    %define i12      [rsp + 192]
+    %define i13      [rsp + 208]
+    %define i14      [rsp + 224]
+    %define i15      [rsp + 240]
+
+    ALIGN_STACK 16, rax
+
+    ; reserve stack space
+    %define      temp_storage  0 ; size is 256 (16*16)
+    %define      stack_size 256
+    sub          rsp, stack_size
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
+
+        ; 8-f
+        movdqa      xmm0, s8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s9                ; 80 90
+        punpckhbw   xmm1, s9                ; 88 98
+
+        movdqa      xmm2, s10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s11 ; a0 b0
+        punpckhbw   xmm3, s11 ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s13 ; c0 d0
+        punpckhbw   xmm5, s13 ; c8 d8
+
+        movdqa      xmm6, s14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s15 ; e0 f0
+        punpckhbw   xmm7, s15 ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 15 registers ...
+        movdqa      i0, xmm0
+        movdqa      i1, xmm7
+        movdqa      i2, xmm4
+        movdqa      i3, xmm3
+        movdqa      i4, xmm1
+        movdqa      i5, xmm8
+        movdqa      i6, xmm2
+        movdqa      i7, xmm5
+
+        ; 0-7
+        movdqa      xmm0, s0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s1 ; 00 10
+        punpckhbw   xmm1, s1 ; 08 18
+
+        movdqa      xmm2, s2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s3 ; 20 30
+        punpckhbw   xmm3, s3 ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s5 ; 40 50
+        punpckhbw   xmm5, s5 ; 48 58
+
+        movdqa      xmm6, s6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s7   ; 60 70
+        punpckhbw   xmm7, s7   ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i0
+        punpckhqdq  xmm6, i0
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i1
+        punpckhqdq  xmm9, i1
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i2
+        punpckhqdq  xmm10, i2
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i3
+        punpckhqdq  xmm11, i3
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i4
+        punpckhqdq  xmm12, i4
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i5
+        punpckhqdq  xmm13, i5
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i6
+        punpckhqdq  xmm14, i6
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i7
+        punpckhqdq  xmm15, i7
+
+        movdqa      i0, xmm0
+        movdqa      i1, xmm6
+        movdqa      i2, xmm7
+        movdqa      i3, xmm9
+        movdqa      i4, xmm4
+        movdqa      i5, xmm10
+        movdqa      i6, xmm3
+        movdqa      i7, xmm11
+        movdqa      i8, xmm1
+        movdqa      i9, xmm12
+        movdqa      i10, xmm8
+        movdqa      i11, xmm13
+        movdqa      i12, xmm2
+        movdqa      i13, xmm14
+        movdqa      i14, xmm5
+        movdqa      i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+        movdqa      xmm12, xmm6
+        movdqa      xmm13, xmm7
+
+        pxor        zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+        movdqa       xmm1, i2
+        movdqa       xmm2, i3
+        movdqa       xmm8, i4
+        movdqa       xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm8
+        movdqa       i5, xmm9
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm3, i8
+        movdqa       xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm3
+        movdqa       i9, xmm4
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm8, i13
+        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm4, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm4
+        movdqa       i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+        ; 8-f
+        movdqa      xmm0, i8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i9                ; 80 90
+        punpckhbw   xmm1, i9                ; 88 98
+
+        movdqa      xmm2, i10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i11               ; a0 b0
+        punpckhbw   xmm3, i11               ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i13               ; c0 d0
+        punpckhbw   xmm5, i13               ; c8 d8
+
+        movdqa      xmm6, i14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i15               ; e0 f0
+        punpckhbw   xmm7, i15               ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 88 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 15 registers ...
+        movdqa      i8, xmm0
+        movdqa      i9, xmm7
+        movdqa      i10, xmm4
+        movdqa      i11, xmm3
+        movdqa      i12, xmm1
+        movdqa      i13, xmm8
+        movdqa      i14, xmm2
+        movdqa      i15, xmm5
+
+        ; 0-7
+        movdqa      xmm0, i0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i1                ; 00 10
+        punpckhbw   xmm1, i1                ; 08 18
+
+        movdqa      xmm2, i2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i3                ; 20 30
+        punpckhbw   xmm3, i3                ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i5                ; 40 50
+        punpckhbw   xmm5, i5                ; 48 58
+
+        movdqa      xmm6, i6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i7                ; 60 70
+        punpckhbw   xmm7, i7                ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i8
+        punpckhqdq  xmm6, i8
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i9
+        punpckhqdq  xmm9, i9
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i10
+        punpckhqdq  xmm10, i10
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i11
+        punpckhqdq  xmm11, i11
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i12
+        punpckhqdq  xmm12, i12
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i13
+        punpckhqdq  xmm13, i13
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i14
+        punpckhqdq  xmm14, i14
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i15
+        punpckhqdq  xmm15, i15
+
+        movdqa      s0, xmm0
+        movdqa      s1, xmm6
+        movdqa      s2, xmm7
+        movdqa      s3, xmm9
+        movdqa      s4, xmm4
+        movdqa      s5, xmm10
+        movdqa      s6, xmm3
+        movdqa      s7, xmm11
+        movdqa      s8, xmm1
+        movdqa      s9, xmm12
+        movdqa      s10, xmm8
+        movdqa      s11, xmm13
+        movdqa      s12, xmm2
+        movdqa      s13, xmm14
+        movdqa      s14, xmm5
+        movdqa      s15, xmm15
+
+    ; free stack space
+    add          rsp, stack_size
+
+    ; un-ALIGN_STACK
+    pop          rsp
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t7f:
+    times 16 db 0x7f
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t1f:
+    times 16 db 0x1f
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
--- a/vp8/encoder/x86/ssim_opt.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
-sym(vp8_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
-sym(vp8_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- /dev/null
+++ b/vp8/encoder/x86/ssim_opt_x86_64.asm
@@ -1,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3
+        pmaddwd         xmm1, xmm1
+        paddd           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4
+        pmaddwd         xmm2, xmm2
+        paddd           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0
+        punpckhdq       xmm2,xmm0
+        paddq           %1,xmm2
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0
+        punpckhqdq      xmm2,xmm0
+        paddq           %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0
+        punpckhwd       xmm1,xmm0
+        paddd           %1, xmm1
+        SUM_ACROSS_Q    %1
+%endmacro
+;void ssim_parms_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
+sym(vp8_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void ssim_parms_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
+sym(vp8_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -115,7 +115,7 @@
 endif
 
 ifeq ($(ARCH_X86_64),yes)
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm
 endif
 
 # common (c)
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -100,7 +100,7 @@
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
--- a/vp9/common/x86/vp9_idct_ssse3.asm
+++ /dev/null
@@ -1,300 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the inverse transformation. Part
-; of the functions are originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-pw_16:      times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
-%endmacro
-
-TRANSFORM_COEFFS    6270, 15137
-TRANSFORM_COEFFS    3196, 16069
-TRANSFORM_COEFFS   13623,  9102
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS     30274, 12540
-PAIR_PP_COEFFS      6392, 32138
-PAIR_MP_COEFFS     18204, 27246
-
-PAIR_PP_COEFFS     12540, 12540
-PAIR_PP_COEFFS     30274, 30274
-PAIR_PP_COEFFS      6392,  6392
-PAIR_PP_COEFFS     32138, 32138
-PAIR_MM_COEFFS     18204, 18204
-PAIR_PP_COEFFS     27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro IDCT8_1D 0
-  SUM_SUB          0,    4,    9
-  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
-  pmulhrsw        m0,  m12
-  pmulhrsw        m4,  m12
-  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
-  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
-
-  SUM_SUB          1,    5,    9
-  SUM_SUB          7,    3,    9
-  SUM_SUB          0,    6,    9
-  SUM_SUB          4,    2,    9
-  SUM_SUB          3,    5,    9
-  pmulhrsw        m3,  m12
-  pmulhrsw        m5,  m12
-
-  SUM_SUB          0,    7,    9
-  SUM_SUB          4,    3,    9
-  SUM_SUB          2,    5,    9
-  SUM_SUB          6,    1,    9
-
-  SWAP             3,    6
-  SWAP             1,    4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
-  paddw           m%1, m11
-  paddw           m%2, m11
-  psraw           m%1, 5
-  psraw           m%2, 5
-
-  movh            m%3, [outputq]
-  movh            m%4, [outputq + strideq]
-  punpcklbw       m%3, m%5
-  punpcklbw       m%4, m%5
-  paddw           m%3, m%1
-  paddw           m%4, m%2
-  packuswb        m%3, m%5
-  packuswb        m%4, m%5
-  movh               [outputq], m%3
-  movh     [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
-  mova     m8, [pd_8192]
-  mova    m11, [pw_16]
-  mova    m12, [pw_11585x2]
-
-  lea      r3, [2 * strideq]
-
-  mova     m0, [inputq +   0]
-  mova     m1, [inputq +  16]
-  mova     m2, [inputq +  32]
-  mova     m3, [inputq +  48]
-  mova     m4, [inputq +  64]
-  mova     m5, [inputq +  80]
-  mova     m6, [inputq +  96]
-  mova     m7, [inputq + 112]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
-  mova       m8, [pd_8192]
-  mova      m11, [pw_16]
-  mova      m12, [pw_11585x2]
-
-  lea        r3, [2 * strideq]
-
-  mova       m0, [inputq +  0]
-  mova       m1, [inputq + 16]
-  mova       m2, [inputq + 32]
-  mova       m3, [inputq + 48]
-
-  punpcklwd  m0, m1
-  punpcklwd  m2, m3
-  punpckhdq  m9, m0, m2
-  punpckldq  m0, m2
-  SWAP       2, 9
-
-  ; m0 -> [0], [0]
-  ; m1 -> [1], [1]
-  ; m2 -> [2], [2]
-  ; m3 -> [3], [3]
-  punpckhqdq m10, m0, m0
-  punpcklqdq m0,  m0
-  punpckhqdq m9,  m2, m2
-  punpcklqdq m2,  m2
-  SWAP       1, 10
-  SWAP       3,  9
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m2, [dpw_30274_12540]
-  pmulhrsw   m1, [dpw_6392_32138]
-  pmulhrsw   m3, [dpw_m18204_27246]
-
-  SUM_SUB    0, 2, 9
-  SUM_SUB    1, 3, 9
-
-  punpcklqdq m9, m3, m3
-  punpckhqdq m5, m3, m9
-
-  SUM_SUB    3, 5, 9
-  punpckhqdq m5, m3
-  pmulhrsw   m5, m12
-
-  punpckhqdq m9, m1, m5
-  punpcklqdq m1, m5
-  SWAP       5, 9
-
-  SUM_SUB    0, 5, 9
-  SUM_SUB    2, 1, 9
-
-  punpckhqdq m3, m0, m0
-  punpckhqdq m4, m1, m1
-  punpckhqdq m6, m5, m5
-  punpckhqdq m7, m2, m2
-
-  punpcklwd  m0, m3
-  punpcklwd  m7, m2
-  punpcklwd  m1, m4
-  punpcklwd  m6, m5
-
-  punpckhdq  m4, m0, m7
-  punpckldq  m0, m7
-  punpckhdq  m10, m1, m6
-  punpckldq  m5, m1, m6
-
-  punpckhqdq m1, m0, m5
-  punpcklqdq m0, m5
-  punpckhqdq m3, m4, m10
-  punpcklqdq m2, m4, m10
-
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m6, m2, [dpw_30274_30274]
-  pmulhrsw   m4, m2, [dpw_12540_12540]
-
-  pmulhrsw   m7, m1, [dpw_32138_32138]
-  pmulhrsw   m1, [dpw_6392_6392]
-  pmulhrsw   m5, m3, [dpw_m18204_m18204]
-  pmulhrsw   m3, [dpw_27246_27246]
-
-  mova       m2, m0
-  SUM_SUB    0, 6, 9
-  SUM_SUB    2, 4, 9
-  SUM_SUB    1, 5, 9
-  SUM_SUB    7, 3, 9
-
-  SUM_SUB    3, 5, 9
-  pmulhrsw   m3, m12
-  pmulhrsw   m5, m12
-
-  SUM_SUB    0, 7, 9
-  SUM_SUB    2, 3, 9
-  SUM_SUB    4, 5, 9
-  SUM_SUB    6, 1, 9
-
-  SWAP       3, 6
-  SWAP       1, 2
-  SWAP       2, 4
-
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-%endif
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
@@ -1,0 +1,300 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the inverse transformation. Part
+; of the functions are originally derived from the ffmpeg project.
+; Note that the current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+pw_16:      times 8 dw 16
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
+%endmacro
+
+TRANSFORM_COEFFS    6270, 15137
+TRANSFORM_COEFFS    3196, 16069
+TRANSFORM_COEFFS   13623,  9102
+
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540
+PAIR_PP_COEFFS      6392, 32138
+PAIR_MP_COEFFS     18204, 27246
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro IDCT8_1D 0
+  SUM_SUB          0,    4,    9
+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
+  pmulhrsw        m0,  m12
+  pmulhrsw        m4,  m12
+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
+
+  SUM_SUB          1,    5,    9
+  SUM_SUB          7,    3,    9
+  SUM_SUB          0,    6,    9
+  SUM_SUB          4,    2,    9
+  SUM_SUB          3,    5,    9
+  pmulhrsw        m3,  m12
+  pmulhrsw        m5,  m12
+
+  SUM_SUB          0,    7,    9
+  SUM_SUB          4,    3,    9
+  SUM_SUB          2,    5,    9
+  SUM_SUB          6,    1,    9
+
+  SWAP             3,    6
+  SWAP             1,    4
+%endmacro
+
+; This macro handles 8 pixels per line
+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
+  paddw           m%1, m11
+  paddw           m%2, m11
+  psraw           m%1, 5
+  psraw           m%2, 5
+
+  movh            m%3, [outputq]
+  movh            m%4, [outputq + strideq]
+  punpcklbw       m%3, m%5
+  punpcklbw       m%4, m%5
+  paddw           m%3, m%1
+  paddw           m%4, m%2
+  packuswb        m%3, m%5
+  packuswb        m%4, m%5
+  movh               [outputq], m%3
+  movh     [outputq + strideq], m%4
+%endmacro
+
+INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
+  mova     m8, [pd_8192]
+  mova    m11, [pw_16]
+  mova    m12, [pw_11585x2]
+
+  lea      r3, [2 * strideq]
+
+  mova     m0, [inputq +   0]
+  mova     m1, [inputq +  16]
+  mova     m2, [inputq +  32]
+  mova     m3, [inputq +  48]
+  mova     m4, [inputq +  64]
+  mova     m5, [inputq +  80]
+  mova     m6, [inputq +  96]
+  mova     m7, [inputq + 112]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+%endif
--- a/vp9/encoder/x86/vp9_dct_ssse3.asm
+++ /dev/null
@@ -1,174 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 0
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-  SUM_SUB            0,  1,  9
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
-  pxor              m11, m11
-
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  FDCT8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -1,0 +1,174 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the forward transformation. Part
+; of the macro definitions are originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 15137,   6270
+TRANSFORM_COEFFS 16069,   3196
+TRANSFORM_COEFFS  9102,  13623
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 0
+  SUM_SUB            0,  7,  9
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+  SUM_SUB            0,  1,  9
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12
+  pmulhrsw           m5, m12
+  pmulhrsw           m0, m12
+  pmulhrsw           m1, m12
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4
+  SWAP               3,  6
+%endmacro
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+  psraw              m%3, m%1, 15
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3
+  psubw              m%2, m%4
+  psraw              m%1, 1
+  psraw              m%2, 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+  mova               m8, [pd_8192]
+  mova              m12, [pw_11585x2]
+  pxor              m11, m11
+
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ /dev/null
@@ -1,219 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
-  mova                            m0, [zbinq]              ; m0 = zbin
-  punpcklwd                       m4, m4
-  mova                            m1, [roundq]             ; m1 = round
-  pshufd                          m4, m4, 0
-  mova                            m2, [quantq]             ; m2 = quant
-  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                        r6, m7
-  pmovmskb                        r2, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
--- /dev/null
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -1,0 +1,219 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
+  mova                            m0, [zbinq]              ; m0 = zbin
+  punpcklwd                       m4, m4
+  mova                            m1, [roundq]             ; m1 = round
+  pshufd                          m4, m4, 0
+  mova                            m2, [quantq]             ; m2 = quant
+  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                        r6, m7
+  pmovmskb                        r2, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
--- a/vp9/encoder/x86/vp9_ssim_opt.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
-sym(vp9_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
-sym(vp9_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- /dev/null
+++ b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
@@ -1,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3
+        pmaddwd         xmm1, xmm1
+        paddd           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4
+        pmaddwd         xmm2, xmm2
+        paddd           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0
+        punpckhdq       xmm2,xmm0
+        paddq           %1,xmm2
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0
+        punpckhqdq      xmm2,xmm0
+        paddq           %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0
+        punpckhwd       xmm1,xmm0
+        paddd           %1, xmm1
+        SUM_ACROSS_Q    %1
+%endmacro
+;void ssim_parms_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
+sym(vp9_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void ssim_parms_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
+sym(vp9_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -121,7 +121,7 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
 ifeq ($(ARCH_X86_64), yes)
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
 endif
 
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -112,12 +112,12 @@
 endif
 
 ifeq ($(ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
+VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c