ref: 240a5a15ef4fecbc13f0c88394feab4facc94133
parent: cd94d5f68e269814591f4decae60da271a27ca3c
parent: 8394990b2749608ea710a9fbfe82bb4bba1529c9
author: Johann Koenig <[email protected]>
date: Tue May 2 10:16:47 EDT 2017
Merge "block error sse2: sum in 32 bits when possible"
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -39,23 +39,18 @@
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
+ ; the sum of two 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
; accumulate in 64bit
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
punpckldq m7, m2, m5
- paddq m4, m1
+ paddq m4, m0
punpckhdq m2, m5
paddq m6, m7
- punpckldq m7, m3, m5
paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
jg .loop
; accumulate horizontally and store in return value
@@ -98,15 +93,13 @@
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ ; the sum of two 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
; accumulate in 64bit
punpckldq m3, m0, m5
punpckhdq m0, m5
paddq m4, m3
- punpckldq m3, m1, m5
paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
jnz .loop
; accumulate horizontally and store in return value