shithub: libvpx

Download patch

ref: 8394990b2749608ea710a9fbfe82bb4bba1529c9
parent: ef5918098d5c7f8ffda960274e3f8e38f02cb487
author: Kyle Siefring <[email protected]>
date: Mon May 1 05:15:29 EDT 2017

block error sse2: sum in 32 bits when possible

Add 31bit pairs before unpacking in x86 block error code

BUG=webm:1210

Change-Id: I5ca8c7f7775585a17fe09d6bbfc25e1f2955eb0a

--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -39,23 +39,18 @@
   pmaddwd   m1, m1
   pmaddwd   m2, m2
   pmaddwd   m3, m3
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
+  paddd     m2, m3
   ; accumulate in 64bit
   punpckldq m7, m0, m5
   punpckhdq m0, m5
   paddq     m4, m7
-  punpckldq m7, m1, m5
-  paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m7
   punpckldq m7, m2, m5
-  paddq     m4, m1
+  paddq     m4, m0
   punpckhdq m2, m5
   paddq     m6, m7
-  punpckldq m7, m3, m5
   paddq     m6, m2
-  punpckhdq m3, m5
-  paddq     m6, m7
-  paddq     m6, m3
   jg .loop
 
   ; accumulate horizontally and store in return value
@@ -98,15 +93,13 @@
   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
   pmaddwd   m0, m0
   pmaddwd   m1, m1
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
   ; accumulate in 64bit
   punpckldq m3, m0, m5
   punpckhdq m0, m5
   paddq     m4, m3
-  punpckldq m3, m1, m5
   paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m3
-  paddq     m4, m1
   jnz .loop
 
   ; accumulate horizontally and store in return value